{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "61e0d688",
   "metadata": {},
   "source": [
    "### A toy example of how to build a narrative model and use it to extract narratives from text.\n",
    "\n",
    "This example is based on 100 sentences from the US Congressional Record. \n",
    "\n",
    "In practice, semantic role labeling is time-consuming, but training and prediction are relatively fast.\n",
    "\n",
    "Of course, given the small number of observations, the results will not be meaningful.\n",
    "\n",
    "For more detailed examples with recent versions of the package **relatio**, please see our repository: https://github.com/relatio-nlp/relatio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7ccac297",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the first N_SENTENCES rows of the sample of speeches.\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "N_SENTENCES = 100\n",
    "\n",
    "# Keep the DataFrame under its own name so that `split_sentences` only ever\n",
    "# refers to the (`doc` column, `sentence` column) tuple used by the later cells.\n",
    "speeches_sample = pd.read_csv('../data/sample_of_speeches.csv').iloc[:N_SENTENCES]\n",
    "\n",
    "split_sentences = (list(speeches_sample['doc']), list(speeches_sample['sentence']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4aa2beb0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-10-27 17:48:27.514452: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running SRL...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████| 100/100 [00:59<00:00,  1.69it/s]\n"
     ]
    }
   ],
   "source": [
    "# Semantic role labeling (SRL) of the sentence texts.\n",
    "\n",
    "from narrativeNLP.wrappers import run_srl\n",
    "\n",
    "# pre-trained model\n",
    "srl_model_path = \"https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz\"\n",
    "\n",
    "srl_res = run_srl(path=srl_model_path, sentences=split_sentences[1], progress_bar=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "682a2327",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building narrative model...\n",
      "About to extract roles ...\n",
      "Dealing with named entities...\n",
      "Save to disk is None; creating entities in memory...\n",
      "About to pick top entities...\n",
      "Top entities were picked...\n",
      "About to map entities...\n",
      "Entities mapped, about to append to narrative model...\n",
      "Entities appended to model.\n",
      "About to deal with roles with embeddings...\n",
      "Loading embeddings model (gensim keyed)...\n",
      "About to get first vectors...\n",
      "Getting vector for 0...\n",
      "Vector done for 0.\n",
      "Processing 5 clusters...\n",
      "Save to disk is None. Training to get kmeans_0_5.pk\n",
      "Initialization complete\n",
      "Iteration 0, inertia 172.43496704101562\n",
      "Iteration 1, inertia 109.03441619873047\n",
      "Iteration 2, inertia 107.99320220947266\n",
      "Iteration 3, inertia 107.4124984741211\n",
      "Iteration 4, inertia 107.01451873779297\n",
      "Iteration 5, inertia 106.75529479980469\n",
      "Iteration 6, inertia 106.52215576171875\n",
      "Converged at iteration 6: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 172.7506866455078\n",
      "Iteration 1, inertia 108.62010192871094\n",
      "Iteration 2, inertia 107.12406921386719\n",
      "Iteration 3, inertia 106.52762603759766\n",
      "Iteration 4, inertia 106.20375061035156\n",
      "Iteration 5, inertia 106.04893493652344\n",
      "Converged at iteration 5: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 177.14295959472656\n",
      "Iteration 1, inertia 110.56100463867188\n",
      "Iteration 2, inertia 108.146240234375\n",
      "Iteration 3, inertia 107.36211395263672\n",
      "Iteration 4, inertia 107.25647735595703\n",
      "Iteration 5, inertia 107.22412109375\n",
      "Converged at iteration 5: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 171.1577606201172\n",
      "Iteration 1, inertia 109.03691864013672\n",
      "Iteration 2, inertia 107.37687683105469\n",
      "Iteration 3, inertia 106.8647232055664\n",
      "Iteration 4, inertia 106.71570587158203\n",
      "Iteration 5, inertia 106.69319152832031\n",
      "Iteration 6, inertia 106.6710433959961\n",
      "Converged at iteration 6: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 170.2021026611328\n",
      "Iteration 1, inertia 108.98085021972656\n",
      "Iteration 2, inertia 107.66226196289062\n",
      "Iteration 3, inertia 107.19783782958984\n",
      "Iteration 4, inertia 106.81069946289062\n",
      "Iteration 5, inertia 106.6785659790039\n",
      "Iteration 6, inertia 106.64326477050781\n",
      "Iteration 7, inertia 106.61671447753906\n",
      "Converged at iteration 7: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 175.2831268310547\n",
      "Iteration 1, inertia 108.50312042236328\n",
      "Iteration 2, inertia 107.91059875488281\n",
      "Iteration 3, inertia 107.74342346191406\n",
      "Iteration 4, inertia 107.63972473144531\n",
      "Iteration 5, inertia 107.48651123046875\n",
      "Converged at iteration 5: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 189.32794189453125\n",
      "Iteration 1, inertia 109.11100006103516\n",
      "Iteration 2, inertia 107.57491302490234\n",
      "Iteration 3, inertia 107.13165283203125\n",
      "Iteration 4, inertia 107.03890228271484\n",
      "Converged at iteration 4: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 173.4180450439453\n",
      "Iteration 1, inertia 110.04821014404297\n",
      "Iteration 2, inertia 108.87590789794922\n",
      "Iteration 3, inertia 108.65190124511719\n",
      "Iteration 4, inertia 108.51531219482422\n",
      "Iteration 5, inertia 108.38279724121094\n",
      "Iteration 6, inertia 108.34650421142578\n",
      "Iteration 7, inertia 108.22686767578125\n",
      "Iteration 8, inertia 108.10074615478516\n",
      "Iteration 9, inertia 108.08033752441406\n",
      "Converged at iteration 9: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 185.90255737304688\n",
      "Iteration 1, inertia 111.14868927001953\n",
      "Iteration 2, inertia 108.48546600341797\n",
      "Iteration 3, inertia 106.5753402709961\n",
      "Iteration 4, inertia 105.48455047607422\n",
      "Iteration 5, inertia 105.29712677001953\n",
      "Iteration 6, inertia 105.21985626220703\n",
      "Iteration 7, inertia 105.17129516601562\n",
      "Converged at iteration 7: strict convergence.\n",
      "Initialization complete\n",
      "Iteration 0, inertia 174.71214294433594\n",
      "Iteration 1, inertia 108.4140396118164\n",
      "Iteration 2, inertia 107.37141418457031\n",
      "Iteration 3, inertia 107.03524017333984\n",
      "Iteration 4, inertia 106.9077377319336\n",
      "Iteration 5, inertia 106.85490417480469\n",
      "Converged at iteration 5: strict convergence.\n",
      "Training concluded: kmeans_0_5.pk\n",
      "Getting clusters for 5...\n",
      "Labeling clusters for 5...\n"
     ]
    }
   ],
   "source": [
    "# Fit the narrative model: the top 5 named entities are kept and\n",
    "# 5 additional unknown entities are assumed.\n",
    "\n",
    "from narrativeNLP.wrappers import build_narrative_model\n",
    "\n",
    "print('Building narrative model...')\n",
    "\n",
    "# Text normalization switches, forwarded unchanged as keyword arguments below.\n",
    "text_cleaning = dict(\n",
    "    remove_punctuation=True,\n",
    "    remove_digits=True,\n",
    "    remove_chars='',\n",
    "    stop_words=[],\n",
    "    lowercase=True,\n",
    "    strip=True,\n",
    "    remove_whitespaces=True,\n",
    "    lemmatize=True,\n",
    "    stem=False,\n",
    "    tags_to_keep=None,\n",
    "    remove_n_letter_words=1,\n",
    ")\n",
    "\n",
    "narrative_model = build_narrative_model(\n",
    "    srl_res=srl_res,\n",
    "    sentences=split_sentences[1],\n",
    "    roles_considered=['ARGO', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],\n",
    "    roles_with_entities=['ARGO', 'ARG1', 'ARG2'],\n",
    "    top_n_entities=5,\n",
    "    roles_with_embeddings=[['ARGO', 'ARG1', 'ARG2']],\n",
    "    embeddings_type='gensim_keyed_vectors',\n",
    "    embeddings_path='glove-wiki-gigaword-300',\n",
    "    n_clusters=[[5]],\n",
    "    dimension_reduce_verbs=False,\n",
    "    max_length=4,\n",
    "    save_to_disk=None,\n",
    "    verbose=1,\n",
    "    progress_bar=False,\n",
    "    **text_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8a3de19d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing SRL...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████| 100/100 [00:00<00:00, 11513.64it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cleaning SRL...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 378/378 [00:00<00:00, 1400.93it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing raw arguments...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████| 378/378 [00:00<00:00, 514087.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mapping named entities...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████| 378/378 [00:00<00:00, 176730.23it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Assigning clusters to roles...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 378/378 [00:00<00:00, 4298.34it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ARGO-RAW</th>\n",
       "      <th>ARG1-RAW</th>\n",
       "      <th>B-V-RAW</th>\n",
       "      <th>sentence</th>\n",
       "      <th>doc</th>\n",
       "      <th>ARG2-RAW</th>\n",
       "      <th>ARG2</th>\n",
       "      <th>ARGO</th>\n",
       "      <th>ARG1</th>\n",
       "      <th>B-ARGM-MOD-RAW</th>\n",
       "      <th>B-ARGM-NEG-RAW</th>\n",
       "      <th>statement</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>ask</td>\n",
       "      <td>0</td>\n",
       "      <td>./gpo_sentences/2006-03-06_1169887.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>be</td>\n",
       "      <td>0</td>\n",
       "      <td>./gpo_sentences/2006-03-06_1169887.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>authorize</td>\n",
       "      <td>0</td>\n",
       "      <td>./gpo_sentences/2006-03-06_1169887.csv</td>\n",
       "      <td>the committee on</td>\n",
       "      <td>the independent counsel</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>the committee on finance</td>\n",
       "      <td>unanimous consent that</td>\n",
       "      <td>meet</td>\n",
       "      <td>0</td>\n",
       "      <td>./gpo_sentences/2006-03-06_1169887.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>the independent counsel</td>\n",
       "      <td>their measure</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>hear</td>\n",
       "      <td>1</td>\n",
       "      <td>./gpo_sentences/2006-03-06_1169887.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>373</th>\n",
       "      <td>allow me</td>\n",
       "      <td></td>\n",
       "      <td>summarize</td>\n",
       "      <td>98</td>\n",
       "      <td>./gpo_sentences/1994-06-21_15547.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>we</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>374</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>be</td>\n",
       "      <td>98</td>\n",
       "      <td>./gpo_sentences/1994-06-21_15547.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>374</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>375</th>\n",
       "      <td></td>\n",
       "      <td>change</td>\n",
       "      <td>need</td>\n",
       "      <td>98</td>\n",
       "      <td>./gpo_sentences/1994-06-21_15547.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>cost of living adjustment</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>376</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>fall</td>\n",
       "      <td>98</td>\n",
       "      <td>./gpo_sentences/1994-06-21_15547.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>377</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>99</td>\n",
       "      <td>./gpo_sentences/1994-06-21_15547.csv</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>377</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>378 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     ARGO-RAW                ARG1-RAW    B-V-RAW  sentence  \\\n",
       "0                                                            ask         0   \n",
       "1                                                             be         0   \n",
       "2                                                      authorize         0   \n",
       "3    the committee on finance  unanimous consent that       meet         0   \n",
       "4                                                           hear         1   \n",
       "..                        ...                     ...        ...       ...   \n",
       "373                  allow me                          summarize        98   \n",
       "374                                                           be        98   \n",
       "375                                            change       need        98   \n",
       "376                                                         fall        98   \n",
       "377                                                                     99   \n",
       "\n",
       "                                        doc          ARG2-RAW  \\\n",
       "0    ./gpo_sentences/2006-03-06_1169887.csv                     \n",
       "1    ./gpo_sentences/2006-03-06_1169887.csv                     \n",
       "2    ./gpo_sentences/2006-03-06_1169887.csv  the committee on   \n",
       "3    ./gpo_sentences/2006-03-06_1169887.csv                     \n",
       "4    ./gpo_sentences/2006-03-06_1169887.csv                     \n",
       "..                                      ...               ...   \n",
       "373    ./gpo_sentences/1994-06-21_15547.csv                     \n",
       "374    ./gpo_sentences/1994-06-21_15547.csv                     \n",
       "375    ./gpo_sentences/1994-06-21_15547.csv                     \n",
       "376    ./gpo_sentences/1994-06-21_15547.csv                     \n",
       "377    ./gpo_sentences/1994-06-21_15547.csv                     \n",
       "\n",
       "                        ARG2                     ARGO  \\\n",
       "0                                                       \n",
       "1                                                       \n",
       "2    the independent counsel                            \n",
       "3                             the independent counsel   \n",
       "4                                                       \n",
       "..                       ...                      ...   \n",
       "373                                                we   \n",
       "374                                                     \n",
       "375                                                     \n",
       "376                                                     \n",
       "377                                                     \n",
       "\n",
       "                          ARG1 B-ARGM-MOD-RAW B-ARGM-NEG-RAW  statement  \n",
       "0                                                                     0  \n",
       "1                                                                     1  \n",
       "2                                                                     2  \n",
       "3                their measure                                        3  \n",
       "4                                                                     4  \n",
       "..                         ...            ...            ...        ...  \n",
       "373                                                                 373  \n",
       "374                                                                 374  \n",
       "375  cost of living adjustment                                      375  \n",
       "376                                                                 376  \n",
       "377                                                                 377  \n",
       "\n",
       "[378 rows x 12 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Apply the fitted narrative model to the SRL output to obtain narratives.\n",
    "\n",
    "from narrativeNLP.wrappers import get_narratives\n",
    "\n",
    "# `split_sentences[0]` holds the `doc` column loaded in the first cell.\n",
    "doc_index = split_sentences[0]\n",
    "\n",
    "# NOTE(review): n_clusters=[0] presumably selects the first clustering\n",
    "# configuration stored in the model -- confirm against the package docs.\n",
    "final_statements = get_narratives(srl_res=srl_res,\n",
    "                                  doc_index=doc_index,\n",
    "                                  narrative_model=narrative_model,\n",
    "                                  n_clusters=[0],\n",
    "                                  cluster_labeling='most_frequent',\n",
    "                                  save_to_disk=None,\n",
    "                                  save_postproc_roles=None,\n",
    "                                  save_raw_roles=None,\n",
    "                                  progress_bar=True)\n",
    "\n",
    "final_statements"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
